While easy to implement and interpret, regression can have poor performance in non-linear settings
Data obtention, splitting, exploratory data analysis
# Load caret for data splitting and model training (attaches lattice and ggplot2).
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Old Faithful geyser data: eruption duration (min) and waiting time between eruptions.
data(faithful)
set.seed(333)
# 50/50 train/test split, stratified on the outcome; list = F returns row indices as a matrix.
inTrain<-createDataPartition(y = faithful$waiting, p = 0.5, list = F)
trainFaith<-faithful[inTrain, ]
testFaith<-faithful[-inTrain, ]
head(trainFaith)
## eruptions waiting
## 6 2.883 55
## 11 1.833 54
## 16 2.167 52
## 19 1.600 52
## 22 1.750 47
## 27 1.967 55
# Exploratory plot: duration looks roughly linear in waiting time.
plot(x = trainFaith$waiting, y = trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Duration")
Building the linear model
# fit linear model: eruption duration as a linear function of waiting time
lm1<-lm(formula = eruptions ~ waiting, data = trainFaith)
summary(lm1)
##
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2699 -0.3479 0.0398 0.3659 1.0502
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.79274 0.22787 -7.87 1e-12 ***
## waiting 0.07390 0.00315 23.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared: 0.803, Adjusted R-squared: 0.802
## F-statistic: 551 on 1 and 135 DF, p-value: <2e-16
# Overlay the fitted regression line on the training scatterplot.
plot(x = trainFaith$waiting, y = trainFaith$eruptions, pch=19, col="blue", xlab="Waiting", ylab="Duration")
lines(x = trainFaith$waiting, lm1$fitted, lwd=3, col="red")
Predicting a new value
# Predict eruption duration for a single new waiting time of 80 minutes;
# the data frame column name must match the model's predictor ("waiting").
newdata<-data.frame(waiting=80)
predict(object = lm1, newdata = newdata)
## 1
## 4.119
Plotting predictions on the training and test sets
# Side-by-side panels: fitted line on training data (left) vs. test data (right).
par(mfrow=c(1,2))
plot(x = trainFaith$waiting,
y = trainFaith$eruptions,
xlab = "Waiting",
ylab = "Duration",
main = "Training Data",
pch=19, col="blue")
# Fitted values on the training set
lines(x = trainFaith$waiting,
y = predict(object = lm1),
lwd=3, col="red")
plot(x = testFaith$waiting,
y = testFaith$eruptions,
xlab = "Waiting",
ylab = "Duration",
main = "Testing Data",
pch=19, col="blue")
# Same model applied to the held-out test set
lines(x = testFaith$waiting,
y = predict(object = lm1, newdata = testFaith),
lwd=3, col="red")
Get the training and test set errors
# training-set error: note this computes sqrt(SSE), the root *sum* of squared
# errors — a true RMSE would divide by the number of observations before sqrt
sqrt(
sum(
(lm1$fitted-trainFaith$eruptions)^2
)
)
## [1] 5.752
# same sqrt-of-SSE error measure on the held-out test set;
# slightly larger than the training-set error, as expected
sqrt(
sum(
(predict(object = lm1, newdata = testFaith)-testFaith$eruptions)^2
)
)
## [1] 5.839
Get Prediction Intervals
# Prediction intervals (wider than confidence intervals: they include residual noise).
pred1<-predict(object = lm1, newdata = testFaith, interval = "prediction")
# order the values for the test data set so the interval lines draw left-to-right
ord<-order(testFaith$waiting)
plot(x = testFaith$waiting, y = testFaith$eruptions, pch=19, col= "blue")
# plot the fit plus the lower/upper prediction-interval bounds
matlines(x = testFaith$waiting[ord], pred1[ord, ], type = "l", lwd=3)
Doing it all in caret
# Same model via caret's train(); method = "lm" wraps stats::lm, so the
# coefficients below match the direct lm() fit above.
modFit<-train(eruptions ~ waiting, data = trainFaith, method = "lm")
summary(modFit$finalModel)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2699 -0.3479 0.0398 0.3659 1.0502
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.79274 0.22787 -7.87 1e-12 ***
## waiting 0.07390 0.00315 23.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared: 0.803, Adjusted R-squared: 0.802
## F-statistic: 551 on 1 and 135 DF, p-value: <2e-16
Data obtention, splitting and exda
# Wage data from the ISLR package (3000 male workers, Mid-Atlantic region).
library(ISLR)
library(ggplot2)
data(Wage)
# drop logwage: it is a direct transform of the outcome (wage) we are trying to predict
Wage<-subset(Wage, select = -c(logwage))
summary(Wage)
## year age sex maritl
## Min. :2003 Min. :18.0 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.8 2. Female: 0 2. Married :2074
## Median :2006 Median :42.0 3. Widowed : 19
## Mean :2006 Mean :42.4 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.0 5. Separated : 55
## Max. :2009 Max. :80.0
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins wage
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083 Min. : 20.1
## 2. Information:1456 2. >=Very Good:2142 2. No : 917 1st Qu.: 85.4
## Median :104.9
## Mean :111.7
## 3rd Qu.:128.7
## Max. :318.3
##
# 70/30 train/test split, stratified on the outcome (wage)
inTrain<-createDataPartition(y = Wage$wage, p = 0.7, list=F)
training<-Wage[inTrain, ]
testing<-Wage[-inTrain, ]
c(dim(training), dim(testing))
## [1] 2102 11 898 11
# view feature plot: pairwise relationships between candidate predictors and wage
featurePlot(x = training[,c("age", "education", "jobclass")], y = training$wage, plot = "pairs")
# more exda plots: wage vs age, coloured by job class and by education
qplot(x = age, y = wage, data = training, color= jobclass)
qplot(x = age, y = wage, data = training, color= factor(education))
Fitting a multicovariate linear relationship
# Linear model with multiple covariates; the factor predictors (jobclass,
# education) are expanded into indicator (dummy) variables automatically.
modFit<-train(wage ~ age + jobclass + education, method = "lm", data = training)
finMod<-modFit$finalModel
finMod
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Coefficients:
## (Intercept) age
## 60.997 0.519
## `jobclass2. Information` `education2. HS Grad`
## 5.245 11.347
## `education3. Some College` `education4. College Grad`
## 23.762 36.754
## `education5. Advanced Degree`
## 62.420
Diagnostics
# residuals vs fitted (plot.lm type 1): look for non-random patterns
plot(x = finMod, 1)
# colour by values not used in the model, to make sure that these are all on the 0 line
qplot(x = finMod$fitted, finMod$residuals, colour=race, data=training)
# Very important!!! Plot residuals by row index: any trend vs. index suggests a
# missing variable related to time or collection order
plot(x = finMod$residuals, pch=19)
Plotting the fitted residuals versus the index shows the high residuals at the right, and you can also see a trend with respect to the row numbers. Whenever you see a trend with respect to row numbers, it suggests that there is a variable (often time or collection order) missing from your model!!
# plot predicted vs truth in test set. ideally they should be close to the diagonal
pred<-predict(object = modFit, newdata = testing)
qplot(x = wage, y = pred, colour=year, data = testing)
The basic idea: if you have a bunch of variables that you want to use to predict an outcome, you can take each of those variables and use them to split the outcome into different groups. Then, you can evaluate the homogeneity of each group, and continue to separate into groups until the groups are homogeneous enough, or small enough, to stop. While this approach is easy to interpret, and gives better performance in non-linear settings, without pruning or cross validation, it can lead to uncertainty, which is harder to estimate, and the results may be variable.
Basic algorithm:
Example: Predicting the species with the other variables in the Iris data set
As always, begin with data obtention, splitting and exda
# Iris data: 150 flowers, 3 species, 4 numeric measurements.
data(iris)
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
# Balanced classes: 50 observations per species.
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
set.seed(333)
# 70/30 split, stratified on the class label
inTrain<-createDataPartition(y = iris$Species, p = 0.7, list=F)
training<-iris[inTrain, ]
testing<-iris[-inTrain, ]
c(dim(training), dim(testing))
## [1] 105 5 45 5
Look for clusters. Note that a linear model may not be appropriate here
qplot(x = Petal.Width, y = Sepal.Width, data = training, colour = Species)
You do not have to implement the tree prediction algorithm. Simply call the method “rpart”
# Fit a classification tree (CART) via the rpart package through caret.
modFit<-train(Species ~., method = "rpart", data = training)
## Loading required package: rpart
# Printed tree: each node shows split rule, n, misclassification loss,
# predicted class, and class probabilities; '*' marks terminal nodes.
modFit$finalModel
## n= 105
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 105 70 setosa (0.3333 0.3333 0.3333)
## 2) Petal.Length< 2.45 35 0 setosa (1.0000 0.0000 0.0000) *
## 3) Petal.Length>=2.45 70 35 versicolor (0.0000 0.5000 0.5000)
## 6) Petal.Length< 4.95 40 5 versicolor (0.0000 0.8750 0.1250) *
## 7) Petal.Length>=4.95 30 0 virginica (0.0000 0.0000 1.0000) *
Interpreting results: The final model tells you what all the nodes are, how they are split, and the probability of being in each class. For example, in the second line, Petal.Length < 2.45: all of the examples with a petal length below 2.45 belong to the species setosa.
Plotting the tree
# Base-graphics dendrogram of the tree; uniform = T spaces node levels evenly.
plot(x = modFit$finalModel, uniform = T, main = "Classification Tree")
# Label nodes with split rules and observation counts (use.n).
text(modFit$finalModel, use.n=T, all =T, cex= 0.8)
A prettier plot
# NOTE(review): require() returns FALSE rather than erroring when the package
# is missing; library(rattle) would fail fast and is generally preferred.
require(rattle)
## Loading required package: rattle
## Rattle: A free graphical interface for data mining with R.
## Version 3.3.0 Copyright (c) 2006-2014 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
# Prettier tree rendering from the rattle package.
fancyRpartPlot(modFit$finalModel)
Remember, ultimately we need to predict values. Use predict()
# For classification, predict() on the caret model returns predicted class labels.
predict(object = modFit, newdata = testing)
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa versicolor versicolor versicolor
## [19] versicolor versicolor versicolor versicolor versicolor versicolor
## [25] virginica versicolor virginica versicolor versicolor versicolor
## [31] virginica virginica virginica virginica virginica virginica
## [37] virginica versicolor virginica virginica virginica virginica
## [43] virginica virginica virginica
## Levels: setosa versicolor virginica
The basic idea is that when you fit complicated models, when you average those models together, you get a smoother fit that gives you a better balance between bias and variance in your fit.
This leads to a similar bias from fitting any one of those model individually, but a reduced variability, because you have averaged those models together. This is most useful for non-linear functions
example: Predicting temperature as a function of ozone
# Ozone data from ElemStatLearn: daily ozone, radiation, temperature, wind.
library(ElemStatLearn)
##
## Attaching package: 'ElemStatLearn'
##
## The following object is masked _by_ '.GlobalEnv':
##
## spam
data(ozone, package = "ElemStatLearn")
# Sort rows by the ozone predictor so later curve plots draw left-to-right.
ozone<-ozone[order(ozone$ozone),]
head(ozone)
## ozone radiation temperature wind
## 17 1 8 59 9.7
## 19 4 25 61 9.7
## 14 6 78 57 18.4
## 45 7 48 80 14.3
## 106 7 49 69 10.3
## 7 8 19 61 20.1
The procedure is as follows: Build a bootstrap sample of the data set (ten times), and use this to build a new data set called ozone0, and fit a loess (type of smooth curve) relating temperature to the ozone, with a common span. Then predict for every single loess curve an outcome for a new data set for the exact same values.
# construct a matrix to store predictions: one row per bootstrap fit, one
# column per point of the fixed prediction grid ozone = 1..155
ll<-matrix(NA, nrow = 10, ncol = 155)
for (i in (1:nrow(ll))){
  # sample row indices from the ozone data set with replacement (bootstrap resample)
  ss<-sample(1:dim(ozone)[1], replace = TRUE)
  # assign the sample to a new data set
  ozone0<-ozone[ss,]
  # order by ozone variable
  ozone0<-ozone0[order(ozone0$ozone),]
  # fit loess (smooth curve) of temperature on ozone with a common span
  loess0<-loess(formula = temperature ~ ozone, data = ozone0, span = 0.2)
  # predict on the fixed grid ozone = 1..155 (newdata must carry the predictor
  # name "ozone"; indexing rows of the ozone data frame as in the original —
  # data.frame(ozone[1:155,]) — would produce NA rows, since the data set has
  # fewer than 155 rows, and the plots below draw these curves against 1:155)
  ll[i,]<-predict(object = loess0, newdata = data.frame(ozone = 1:155))
}
Visualization and interpretation
# plot observed temperature vs ozone
plot(x = ozone$ozone, y = ozone$temperature, pch =19, cex =0.5)
We want to show the fit with one resampled data set. These are given by the grey lines below
# Redraw the observed points, then overlay the bootstrap fits.
plot(x = ozone$ozone, y = ozone$temperature, pch =19, cex =0.5)
# construct fits: one grey loess curve per bootstrap resample
for (i in (1:10)){lines(1:155, ll[i,], col="grey", lwd=2)}
# add bagged fit to plot: average the 10 curves pointwise (column means)
lines(x = 1:155, apply(ll, 2, mean), col ="red", lwd=2)
The red line is the bagged loess curve.
Bagging in caret is accomplished by setting the method to be bagEarth, treebag, or bagFDA, or you can build your own bagging function.
example: bagging from regression trees
Take your predictor variables and put them into one data frame.
# Predictors must be supplied to bag() as a data frame (here a single column).
predictors<-data.frame(ozone=ozone$ozone)
head(predictors)
## ozone
## 1 1
## 2 4
## 3 6
## 4 7
## 5 7
## 6 8
Then, obtain the outcome variable
temperature<-ozone$temperature
Pass these as arguments to the bag() function in the caret package, with the number of subsamples B, and the list of options you want for how to fit the model in bagControl
# Bag conditional inference trees: B = 10 bootstrap fits, combined via
# ctreeBag's fit/predict/aggregate hooks supplied through bagControl.
treebag<-bag(x = predictors,
y = temperature, B = 10,
bagControl = bagControl(fit = ctreeBag$fit,
predict = ctreeBag$pred,
aggregate = ctreeBag$aggregate))
## Loading required package: grid
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
## Loading required package: strucchange
## Loading required package: modeltools
## Loading required package: stats4
# Grey: observed values.
plot(ozone$ozone, ozone$temperature, col="lightgrey", pch=19)
# Red: predictions from a single bootstrap tree fit.
points(ozone$ozone, predict(treebag$fits[[1]]$fit, predictors), pch=19, col="red")
# Blue: bagged (aggregated) predictions over all 10 fits.
points(ozone$ozone, predict(treebag, predictors), pch=19, col="blue")
The grey dots represent the observed values; the red dots are fits from a single conditional regression tree. Note how these do not capture the upward trend very well. Averaging over ten model fits with these regression trees shows the trend (blue)
An extension of bagging. We take a resample of the data set, and at each split, we also bootstrap the variables. It allows us to grow a large number of trees. It is very accurate, but it is slow, and can be difficult to interpret
example Iris data
set.seed(333)
# 70/30 split on iris, stratified on the class label
inTrain<-createDataPartition(y = iris$Species, p = 0.7, list = F)
training<-iris[inTrain, ]
testing<-iris[-inTrain, ]
# Random forest; proximity = T stores the between-observation proximity matrix
# (used below by classCenter()).
modFit<-train(Species~., method = "rf", data = training, proximity = T)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
modFit
## Random Forest
##
## 105 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 105, 105, 105, 105, 105, 105, ...
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 1 0.9 0.04 0.06
## 3 1 0.9 0.04 0.07
## 4 1 0.9 0.04 0.07
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
Obtain the second tree that was produced with getTree()
# Inspect tree k = 2 of the fitted forest; status -1 marks terminal nodes,
# "split var" is the column index of the splitting variable.
getTree(modFit$finalModel, k = 2)
## left daughter right daughter split var split point status prediction
## 1 2 3 3 4.85 1 0
## 2 4 5 3 2.60 1 0
## 3 6 7 3 4.95 1 0
## 4 0 0 0 0.00 -1 1
## 5 8 9 3 4.70 1 0
## 6 10 11 1 6.60 1 0
## 7 0 0 0 0.00 -1 3
## 8 0 0 0 0.00 -1 2
## 9 12 13 4 1.60 1 0
## 10 0 0 0 0.00 -1 3
## 11 0 0 0 0.00 -1 2
## 12 0 0 0 0.00 -1 2
## 13 14 15 1 6.05 1 0
## 14 0 0 0 0.00 -1 2
## 15 0 0 0 0.00 -1 3
Obtain the center predictions of the classes
# Class "centers" in Petal.Length/Petal.Width space (columns 3 and 4),
# computed from the random-forest proximity matrix.
irisP<-as.data.frame(
classCenter(x = training[, c(3, 4)],
label = training$Species,
prox = modFit$finalModel$proximity)
)
# Carry the species name out of the rownames so it can be mapped to colour.
irisP$Species<-rownames(irisP)
p<-qplot(x = Petal.Width, y = Petal.Length, data = training, col = Species)
# Overlay the class centers as large X marks (shape 4).
p +geom_point(aes(x = Petal.Width, y = Petal.Length, col=Species), size = 5, shape =4, data = irisP)
Predict new values
# Predict classes on the held-out test set.
pred<-predict(object = modFit, newdata = testing)
# check to see if the prediction is correct (logical vector, used for plotting below)
predRight<-pred == testing$Species
# confusion table: predictions (rows) vs truth (columns)
table(pred, testing$Species)
##
## pred setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 13 0
## virginica 0 2 15
We missed two; to see which these were,
qplot(Petal.Width, Petal.Length, colour = predRight, data = testing)
Take a large number of possibly weak predictors, weight them and add them up to get a stronger predictor. We could take all possible trees, or all possible regression models, etc.
Example: Wage
data(Wage)
# drop logwage: it is a direct transform of the outcome (wage)
Wage<-subset(Wage, select = -c(logwage))
set.seed(333)
# 70/30 split stratified on the outcome. The original transcript partitioned on
# iris$Species here (a copy-paste error, yielding only 105 training rows); the
# split for the Wage model must be built from the Wage outcome itself.
inTrain<-createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training<-Wage[inTrain,]
testing<-Wage[-inTrain, ]
# use boosting with trees, and suppress output with verbose = F
modFit<-train(wage~., method = "gbm", data=training, verbose =F)
# NOTE(review): "105 samples" below matches a partition of iris (0.7 * 150),
# not of Wage (which would give ~2102 training rows) — this printed output
# reflects a data split built from iris$Species rather than Wage$wage.
modFit
## Stochastic Gradient Boosting
##
## 105 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 105, 105, 105, 105, 105, 105, ...
##
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared RMSE SD Rsquared SD
## 1 50 36 0.3 5 0.08
## 1 100 38 0.3 5 0.09
## 1 150 38 0.3 5 0.10
## 2 50 37 0.3 5 0.09
## 2 100 38 0.3 5 0.10
## 2 150 38 0.3 5 0.10
## 3 50 37 0.3 5 0.09
## 3 100 38 0.3 5 0.10
## 3 150 38 0.3 5 0.10
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 50, interaction.depth
## = 1 and shrinkage = 0.1.
Visualizing the results
qplot(x = predict(object = modFit, newdata = testing), y = wage, data = testing)